{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Requirements:\n",
    "    \n",
    "- Python version: Python 3.8.3\n",
    "- numpy version 1.19.2\n",
    "- pandas version 1.1.3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Merge all Age Data Together\n",
    "Create the file ‘inventor_age_score_gender.csv'\n",
    "\n",
    "Required files:\n",
    "\n",
    "- 'target_out.csv'\n",
    "- 'newinventor_with_birthYear.csv'\n",
    "- 'newinventor_no_birthYear.csv'\n",
    "- 'newinventor_with_birthYearNA.csv'\n",
    "- 'newinventor_gender.csv'\n",
    "- 'us_inventor_spokeo_radaris_beenverified_with_patent_id.csv'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/mkaltenberg/Dropbox/Inventor_age/replication_data/inventor_age\n"
     ]
    }
   ],
   "source": [
    "cd \"your/folder/here/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8080135"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inventor = pd.read_csv('us_inventor_spokeo_radaris_beenverified_with_patent_id.csv')\n",
    "# Inventor ids relate to disamb_inventor_id_20181127 dataset\n",
    "len(inventor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3648663"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#number of unique patents \n",
    "inventor['patent_id'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8080135"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#cleaning it up\n",
    "inventor =  inventor.rename(columns = {'inventor_first_name_x':'inventor_first_name','inventor_city_x':'inventor_city', 'inventor_state_x':'inventor_state', \n",
    "                           'inventor_last_name_x': 'inventor_last_name', 'inventor_id_x':'inventor_id'}).drop('Unnamed: 0',1)\n",
    "len(inventor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#second dataset to connect\n",
    "inventor_n = pd.read_csv(\"target_out.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "## only include revlevant new information\n",
    "inventor_n = inventor_n[['inventor_id', 'peoplefinders_age', 'peoplefinders_score']]\n",
    "inventor_n['peoplefinders_age'] = inventor_n['peoplefinders_age'].astype(int)\n",
    "inventor_n['peoplefinders_score'] = inventor_n['peoplefinders_score'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "inventor = pd.merge(inventor,inventor_n, on= 'inventor_id', how='left').fillna(-10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Age Data based on Heuristics\n",
    "\n",
    "Identify different levels of age agreemet between our sources.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Including relevant information\n",
    "stats = inventor[['inventor_id', 'inventor_city', 'inventor_state','radaris_age', 'spokeo_age', \n",
    "                  'been_age','radaris_score','spokeo_score','been_score', 'peoplefinders_age','peoplefinders_score']].drop_duplicates()\n",
    "#replace all negative values of age with a negative number for a particular source ( radaris, -1, spokeo -2, beenverified -3 ad peoplefinders -4)\n",
    "stats['radaris_age'] = np.where(stats['radaris_age']<0, -1, stats['radaris_age'])\n",
    "stats['spokeo_age'] = np.where(stats['spokeo_age']<0, -2, stats['spokeo_age'])\n",
    "stats['been_age'] = np.where(stats['been_age']<0, -3, stats['been_age'])\n",
    "stats['peoplefinders_age'] = np.where(stats['peoplefinders_age']<0, -4, stats['peoplefinders_age'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "#ensure everything is in integers\n",
    "direcs = ['spokeo_age','radaris_age','been_age','peoplefinders_age','peoplefinders_score']\n",
    "for i in direcs:\n",
    "    stats[i] = stats[i].astype(int)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Age agreement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# identify agreement depending on type of source (building indicators for these types)\n",
    "#4agree\n",
    "stats['4agree'] = np.where((stats['radaris_age' ]==stats['spokeo_age'])\n",
    "                           & (stats['spokeo_age']==stats['been_age'])\n",
    "                           & (stats['radaris_age']==stats['been_age'])\n",
    "                           & (stats['peoplefinders_age']==stats['been_age'])\n",
    "                           & (stats['peoplefinders_age']==stats['radaris_age'])\n",
    "                           & (stats['spokeo_age']==stats['peoplefinders_age']),1, 0)\n",
    "#3 agree\n",
    "stats['sbp'] = np.where((stats['peoplefinders_age']==stats['spokeo_age'])\n",
    "                           & (stats['spokeo_age']==stats['been_age'])\n",
    "                           & (stats['peoplefinders_age']==stats['been_age']) ,1, 0)\n",
    "stats['brp'] = np.where((stats['radaris_age']==stats['peoplefinders_age'])\n",
    "                           & (stats['peoplefinders_age']==stats['been_age'])\n",
    "                           & (stats['radaris_age']==stats['been_age']) ,1, 0)\n",
    "stats['spr'] = np.where((stats['radaris_age']==stats['spokeo_age'])\n",
    "                           & (stats['spokeo_age']==stats['peoplefinders_age'])\n",
    "                           & (stats['radaris_age']==stats['peoplefinders_age']) ,1, 0)\n",
    "stats['sbr'] = np.where((stats['radaris_age']==stats['spokeo_age'])\n",
    "                           & (stats['spokeo_age']==stats['been_age'])\n",
    "                           & (stats['radaris_age']==stats['been_age']) ,1, 0)\n",
    "stats['3agree'] = np.where((stats['sbp']==1) | (stats['brp']==1)| (stats['spr']==1)|(stats['sbr']==1),1,0)\n",
    "stats['3agree'] = np.where((stats['3agree']==1)&(stats['4agree']==1),0,stats['3agree'])\n",
    "\n",
    "#2 agree\n",
    "stats['sb_agree'] = np.where(stats['spokeo_age']==stats['been_age'],1, 0)\n",
    "stats['sr_agree'] = np.where(stats['spokeo_age']==stats['radaris_age'],1, 0)\n",
    "stats['br_agree'] = np.where(stats['been_age']==stats['radaris_age'],1, 0)\n",
    "\n",
    "stats['sp_agree'] = np.where(stats['spokeo_age']==stats['peoplefinders_age'],1, 0)\n",
    "stats['bp_agree'] = np.where(stats['been_age']==stats['peoplefinders_age'],1, 0)\n",
    "stats['rp_agree'] = np.where(stats['radaris_age']==stats['peoplefinders_age'],1, 0)\n",
    "\n",
    "stats['2agree'] = np.where((stats['sb_agree']==1)| (stats['sr_agree']| stats['br_agree']| (stats['br_agree'])\n",
    "                    | (stats['sp_agree']) |  stats['bp_agree'] | stats['rp_agree']) ,1,0)\n",
    "stats['2agree'] = np.where((stats['2agree']==1) & (stats['3agree']==1),0, stats['2agree'])\n",
    "stats['2agree'] = np.where((stats['2agree']==1) & (stats['4agree']==1),0, stats['2agree'])\n",
    "\n",
    "#none agree\n",
    "stats['none'] = np.where((stats['radaris_age']==-1)\n",
    "                           & (stats['spokeo_age']==-2)\n",
    "                           & (stats['been_age']==-3)\n",
    "                           & (stats['peoplefinders_age'] == -4),1, 0)\n",
    "#only one age found\n",
    "stats['s'] = np.where((stats['radaris_age']==-1)\n",
    "                           & (stats['spokeo_age']>0)\n",
    "                           & (stats['been_age']==-3)\n",
    "                           & (stats['peoplefinders_age']==-4),1, 0)\n",
    "stats['b'] = np.where((stats['radaris_age']==-1)\n",
    "                           & (stats['spokeo_age']==-2)\n",
    "                           & (stats['been_age']>0)\n",
    "                          & (stats['peoplefinders_age']==-4),1, 0)\n",
    "stats['r'] = np.where((stats['radaris_age']>0)\n",
    "                           & (stats['spokeo_age']==-2)\n",
    "                           & (stats['been_age']==-3)\n",
    "                           & (stats['peoplefinders_age']==-4),1, 0)\n",
    "stats['p'] = np.where((stats['peoplefinders_age']>0)\n",
    "                           & (stats['spokeo_age']==-2)\n",
    "                           & (stats['been_age']==-3)\n",
    "                          & (stats['radaris_age']==-1),1, 0)\n",
    "stats['only'] = np.where((stats['s']==1) & (stats['2agree']==0)& (stats['3agree']==0) & (stats['4agree']==0),1,0)\n",
    "stats['only'] = np.where((stats['b']==1) & (stats['2agree']==0)& (stats['3agree']==0) & (stats['4agree']==0),1,stats['only'])\n",
    "stats['only'] = np.where((stats['r']==1) & (stats['2agree']==0)& (stats['3agree']==0) & (stats['4agree']==0),1,stats['only'])\n",
    "stats['only'] = np.where((stats['p']==1) & (stats['2agree']==0)& (stats['3agree']==0) & (stats['4agree']==0),1,stats['only'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n",
      "0\n"
     ]
    }
   ],
   "source": [
    "#sanity check\n",
    "print(len(stats[(stats['only']==1) & (stats['4agree']==1)]))\n",
    "print(len(stats[(stats['only']==1) & (stats['3agree']==1)]))\n",
    "print(len(stats[(stats['only']==1) & (stats['2agree']==1)]))\n",
    "print(len(stats[(stats['only']==1) & (stats['none']==1)]))\n",
    "\n",
    "\n",
    "print(len(stats[(stats['none']==1) & (stats['4agree']==1)]))\n",
    "print(len(stats[(stats['none']==1) & (stats['2agree']==1)]))\n",
    "print(len(stats[(stats['none']==1) & (stats['3agree']==1)]))\n",
    "print(len(stats[(stats['none']==1) & (stats['only']==1)]))\n",
    "\n",
    "\n",
    "print(len(stats[(stats['2agree']==1) & (stats['4agree']==1)]))\n",
    "print(len(stats[(stats['2agree']==1) & (stats['3agree']==1)]))\n",
    "print(len(stats[(stats['2agree']==1) & (stats['only']==1)]))\n",
    "print(len(stats[(stats['2agree']==1) & (stats['none']==1)]))\n",
    "\n",
    "print(len(stats[(stats['3agree']==1) & (stats['4agree']==1)]))\n",
    "print(len(stats[(stats['3agree']==1) & (stats['2agree']==1)]))\n",
    "print(len(stats[(stats['3agree']==1) & (stats['only']==1)]))\n",
    "print(len(stats[(stats['3agree']==1) & (stats['none']==1)]))\n",
    "\n",
    "print(len(stats[(stats['4agree']==1) & (stats['2agree']==1)]))\n",
    "print(len(stats[(stats['4agree']==1) & (stats['3agree']==1)]))\n",
    "print(len(stats[(stats['4agree']==1) & (stats['only']==1)]))\n",
    "print(len(stats[(stats['4agree']==1) & (stats['none']==1)]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "#dummy for only one source found\n",
    "stats['onesource'] = np.where((stats['s']==1)| (stats['b']==1) | (stats['r']==1) | (stats['p']==1), 1,0) \n",
    "stats['onesource'] = np.where((stats['sbr']==1), 0, stats['onesource'])\n",
    "stats['onesource'] = np.where((stats['sbp']==1), 0, stats['onesource'])\n",
    "stats['onesource'] = np.where((stats['brp']==1), 0, stats['onesource'])\n",
    "stats['onesource'] = np.where((stats['spr']==1), 0, stats['onesource'])\n",
    "stats['onesource'] = np.where((stats['4agree']==1), 0, stats['onesource'])\n",
    "\n",
    "#dummy for two sources found\n",
    "stats['twosources'] = np.where((stats['spokeo_age']>0) &(stats['radaris_age']>0),1, 0 )\n",
    "stats['twosources'] = np.where((stats['peoplefinders_age']>0) &(stats['radaris_age']>0),1, stats['twosources'])\n",
    "stats['twosources'] = np.where((stats['been_age']>0) &(stats['spokeo_age']>0),1, stats['twosources'])\n",
    "stats['twosources'] = np.where((stats['been_age']>0) &(stats['radaris_age']>0),1, stats['twosources'])\n",
    "stats['twosources'] = np.where((stats['been_age']>0) &(stats['peoplefinders_age']>0),1, stats['twosources'])\n",
    "stats['twosources'] = np.where((stats['spokeo_age']>0) &(stats['peoplefinders_age']>0),1, stats['twosources'])\n",
    "\n",
    "stats['twosources'] = np.where((stats['spokeo_age']>0) &(stats['peoplefinders_age']>0) & (stats['radaris_age']>0), 0, stats['twosources'])\n",
    "stats['twosources'] = np.where((stats['spokeo_age']>0) &(stats['peoplefinders_age']>0) & (stats['been_age']>0), 0, stats['twosources'])\n",
    "stats['twosources'] = np.where((stats['spokeo_age']>0) &(stats['been_age']>0) & (stats['radaris_age']>0), 0, stats['twosources'])\n",
    "stats['twosources'] = np.where((stats['been_age']>0) &(stats['peoplefinders_age']>0) & (stats['radaris_age']>0), 0, stats['twosources'])\n",
    "stats['twosources'] = np.where((stats['been_age']>0) &(stats['peoplefinders_age']>0) & (stats['radaris_age']>0) &(stats['spokeo_age']>0), 0, stats['twosources'])\n",
    "\n",
    "#dummy for 3 sources found\n",
    "stats['threesources'] = 0\n",
    "stats['threesources'] = np.where((stats['spokeo_age']>0) &(stats['peoplefinders_age']>0) & (stats['radaris_age']>0), 1, stats['threesources'])\n",
    "stats['threesources'] = np.where((stats['spokeo_age']>0) &(stats['peoplefinders_age']>0) & (stats['been_age']>0), 1, stats['threesources'])\n",
    "stats['threesources'] = np.where((stats['spokeo_age']>0) &(stats['been_age']>0) & (stats['radaris_age']>0), 1, stats['threesources'])\n",
    "stats['threesources'] = np.where((stats['been_age']>0) &(stats['peoplefinders_age']>0) & (stats['radaris_age']>0), 1, stats['threesources'])\n",
    "stats['threesources'] = np.where((stats['been_age']>0) &(stats['peoplefinders_age']>0) & (stats['radaris_age']>0) &(stats['spokeo_age']>0), 0, stats['threesources'])\n",
    "\n",
    "#dummy for 4 sources found\n",
    "stats['foursources'] = 0\n",
    "stats['foursources'] = np.where((stats['spokeo_age']>0) & (stats['peoplefinders_age']>0)& (stats['radaris_age']>0) & (stats['peoplefinders_age']>0),1,stats['foursources'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Age based on heuristics\n",
    "\n",
    "#creating age depending on heuristic\n",
    "stats['oneage']=0\n",
    "\n",
    "# where there is no age, using -1 (noage == -1)\n",
    "stats['oneage'] = np.where(stats['none']==1, stats['radaris_age'], stats['oneage']) \n",
    "\n",
    "#use the age if only one age is found\n",
    "#all types of age categories\n",
    "stats['oneage'] = np.where(stats['s']==1, stats['spokeo_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where(stats['r']==1, stats['radaris_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where(stats['b']==1, stats['been_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where(stats['p']==1, stats['peoplefinders_age'], stats['oneage'])\n",
    "\n",
    "#use when age agree with 2 sources (and only have 2 sources)\n",
    "stats['oneage']=np.where((stats['twosources']==1) & (stats['spokeo_age']>0), stats['spokeo_age'], stats['oneage'])\n",
    "stats['oneage']=np.where((stats['twosources']==1) & (stats['radaris_age']>0), stats['radaris_age'], stats['oneage'])\n",
    "stats['oneage']=np.where((stats['twosources']==1) & (stats['peoplefinders_age']>0), stats['peoplefinders_age'], stats['oneage'])\n",
    "stats['oneage']=np.where((stats['twosources']==1) & (stats['been_age']>0), stats['been_age'], stats['oneage'])\n",
    "\n",
    "#using age when 3 agree\n",
    "stats['oneage'] = np.where(stats['sbr']==1, stats['been_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where(stats['sbp']==1, stats['been_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where(stats['brp']==1, stats['been_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where(stats['spr']==1, stats['radaris_age'], stats['oneage'])\n",
    "\n",
    "# Cases where only 3 ages were found, but at least 2 agree\n",
    "stats['oneage'] = np.where((stats['sb_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0), stats['been_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['sr_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0), stats['spokeo_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['br_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0), stats['been_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['sp_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0), stats['spokeo_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['bp_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0), stats['been_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['rp_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0), stats['radaris_age'], stats['oneage'])\n",
    "\n",
    "#where 3 ages agree, but not all 4 agree\n",
    "stats['oneage'] = np.where((stats['sbr']==1) & (stats['threesources']==0) & (stats['foursources']==1), stats['radaris_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['sbp']==1) & (stats['threesources']==0) & (stats['foursources']==1), stats['been_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['brp']==1) & (stats['threesources']==0) & (stats['foursources']==1), stats['radaris_age'], stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['spr']==1) & (stats['threesources']==0) & (stats['foursources']==1), stats['radaris_age'], stats['oneage'])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Age disagreement"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "#identifying subgroups of data\n",
    "# 3 different ages and all 3 ages differ\n",
    "stats['sbr_disagree_all']= 0\n",
    "stats['sbr_disagree_all'] = np.where((stats['radaris_age']>0) &(stats['been_age']>0) &(stats['spokeo_age']>0) &(stats['radaris_age']!=stats['spokeo_age']) & (stats['been_age']!=stats['spokeo_age']) & (stats['been_age']!=stats['radaris_age'])\n",
    "      & (stats['threesources']==1) & (stats['foursources']==0),1, stats['sbr_disagree_all'])\n",
    "\n",
    "stats['sbp_disagree_all']= 0\n",
    "stats['sbp_disagree_all'] = np.where((stats['peoplefinders_age']>0) &(stats['been_age']>0) &(stats['spokeo_age']>0) &(stats['peoplefinders_age']!=stats['spokeo_age']) & (stats['been_age']!=stats['spokeo_age']) & (stats['been_age']!=stats['peoplefinders_age'])\n",
    "      & (stats['threesources']==1) & (stats['foursources']==0),1, stats['sbp_disagree_all'])\n",
    "\n",
    "stats['brp_disagree_all']= 0\n",
    "stats['brp_disagree_all'] = np.where((stats['radaris_age']>0) &(stats['been_age']>0) &(stats['peoplefinders_age']>0) &(stats['radaris_age']!=stats['peoplefinders_age']) & (stats['been_age']!=stats['peoplefinders_age']) & (stats['been_age']!=stats['radaris_age'])\n",
    "      & (stats['threesources']==1) & (stats['foursources']==0),1, stats['brp_disagree_all'])\n",
    "\n",
    "stats['spr_disagree_all']= 0\n",
    "stats['spr_disagree_all'] = np.where((stats['radaris_age']>0) &(stats['peoplefinders_age']>0) &(stats['spokeo_age']>0) &(stats['radaris_age']!=stats['spokeo_age']) & (stats['peoplefinders_age']!=stats['spokeo_age']) & (stats['peoplefinders_age']!=stats['radaris_age'])\n",
    "      & (stats['threesources']==1) & (stats['foursources']==0),1, stats['spr_disagree_all'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# finding differences when there are only 3 sources and 2 disagree in age\n",
    "stats['1/3diff'] = 0\n",
    "\n",
    "stats['1/3diff'] = np.where((stats['sb_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0) & (stats['radaris_age']>0) , np.abs(stats['spokeo_age']-stats['radaris_age']), stats['1/3diff'])\n",
    "stats['1/3diff'] = np.where((stats['sb_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['peoplefinders_age']>0), np.abs(stats['spokeo_age']-stats['peoplefinders_age']), stats['1/3diff'])\n",
    "\n",
    "stats['1/3diff'] = np.where((stats['sr_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['peoplefinders_age']>0) , np.abs(stats['spokeo_age']-stats['peoplefinders_age']), stats['1/3diff'])\n",
    "stats['1/3diff'] = np.where((stats['sr_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['been_age']>0) , np.abs(stats['spokeo_age']-stats['been_age']), stats['1/3diff'])\n",
    "\n",
    "stats['1/3diff'] = np.where((stats['br_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['spokeo_age']>0) , np.abs(stats['been_age']-stats['spokeo_age']), stats['1/3diff'])\n",
    "stats['1/3diff'] = np.where((stats['br_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['peoplefinders_age']>0) , np.abs(stats['been_age']-stats['peoplefinders_age']), stats['1/3diff'])\n",
    "\n",
    "stats['1/3diff'] = np.where((stats['sp_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['been_age']>0) , np.abs(stats['spokeo_age']-stats['been_age']), stats['1/3diff'])\n",
    "stats['1/3diff'] = np.where((stats['sp_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['radaris_age']>0) , np.abs(stats['spokeo_age']-stats['radaris_age']), stats['1/3diff'])\n",
    "\n",
    "stats['1/3diff'] = np.where((stats['bp_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['spokeo_age']>0) , np.abs(stats['spokeo_age']-stats['been_age']), stats['1/3diff'])\n",
    "stats['1/3diff'] = np.where((stats['bp_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['radaris_age']>0) , np.abs(stats['been_age']-stats['radaris_age']), stats['1/3diff'])\n",
    "\n",
    "stats['1/3diff'] = np.where((stats['rp_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['spokeo_age']>0) , stats['spokeo_age']-stats['radaris_age'], stats['1/3diff'])\n",
    "stats['1/3diff'] = np.where((stats['rp_agree']==1) & (stats['threesources']==1) & (stats['foursources']==0)& (stats['been_age']>0) , stats['been_age']-stats['radaris_age'], stats['1/3diff'])\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "#dummy for 3 sources found but all disagree\n",
    "stats['3sourcediff'] = np.where((stats['sbr_disagree_all']==1) | (stats['sbp_disagree_all']==1)  | (stats['brp_disagree_all']== 1) | (stats['spr_disagree_all']==1),1, 0)\n",
    "stats['oneage']= np.where(stats['3sourcediff']==1, -3,stats['oneage'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "#dummy for 4 sources found but all disagree\n",
    "stats['4sourcediff'] = np.where((stats['foursources']==1) & (stats['spokeo_age']!=stats['been_age']) &  (stats['spokeo_age']!=stats['radaris_age'])\n",
    "                                & (stats['spokeo_age']!=stats['peoplefinders_age']) & (stats['been_age']!=stats['peoplefinders_age'])\n",
    "                                & (stats['been_age']& stats['radaris_age']) & (stats['radaris_age']!=stats['peoplefinders_age']),1,0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# age differences when all three disagree\n",
    "stats['sp_sprdiff'] = np.where(stats['spr_disagree_all'] == 1, np.abs(stats['spokeo_age']-stats['peoplefinders_age']), np.nan)\n",
    "stats['sr_sprdiff'] = np.where(stats['spr_disagree_all'] == 1, np.abs(stats['spokeo_age']-stats['radaris_age']), np.nan)\n",
    "stats['pr_sprdiff'] = np.where(stats['spr_disagree_all'] == 1, np.abs(stats['peoplefinders_age']-stats['radaris_age']), np.nan)\n",
    "\n",
    "stats['bp_brpdiff'] = np.where(stats['brp_disagree_all'] == 1, np.abs(stats['been_age']-stats['peoplefinders_age']), np.nan)\n",
    "stats['br_brpdiff'] = np.where(stats['brp_disagree_all'] == 1, np.abs(stats['been_age']-stats['radaris_age']), np.nan)\n",
    "stats['pr_brpdiff'] = np.where(stats['brp_disagree_all'] == 1, np.abs(stats['peoplefinders_age']-stats['radaris_age']), np.nan)\n",
    "\n",
    "stats['sp_sbpdiff'] = np.where(stats['sbp_disagree_all'] == 1, np.abs(stats['spokeo_age']-stats['peoplefinders_age']), np.nan)\n",
    "stats['sb_sbpdiff'] = np.where(stats['sbp_disagree_all'] == 1, np.abs(stats['spokeo_age']-stats['been_age']), np.nan)\n",
    "stats['bp_sbpdiff'] = np.where(stats['sbp_disagree_all'] == 1, np.abs(stats['peoplefinders_age']-stats['been_age']), np.nan)\n",
    "\n",
    "stats['sb_sbrdiff'] = np.where(stats['sbr_disagree_all'] == 1, np.abs(stats['spokeo_age']-stats['been_age']), np.nan)\n",
    "stats['br_sbrdiff'] = np.where(stats['sbr_disagree_all'] == 1, np.abs(stats['been_age']-stats['radaris_age']), np.nan)\n",
    "stats['sr_sbrdiff'] = np.where(stats['sbr_disagree_all'] == 1, np.abs(stats['spokeo_age']-stats['radaris_age']), np.nan)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### When disagreement is less than 3 years, create indicator variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "#replace age where the differences is less than 3 years between all the sources\n",
    "stats['oneage'] = np.where((stats['brp_disagree_all']==1) & (stats['bp_brpdiff']<3) & (stats['bp_brpdiff']<3) & (stats['pr_brpdiff']<3), stats['been_age'],stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['spr_disagree_all']==1) & (stats['sp_sprdiff']<3) & (stats['sr_sprdiff']<3) & (stats['pr_sprdiff']<3), stats['spokeo_age'],stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['sbp_disagree_all']==1) & (stats['sp_sbpdiff']<3) & (stats['sb_sbpdiff']<3) & (stats['bp_sbpdiff']<3), stats['been_age'],stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['sbr_disagree_all']==1) & (stats['sb_sbrdiff']<3) & (stats['br_sbrdiff']<3) & (stats['sr_sbrdiff']<3), stats['radaris_age'],stats['oneage'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# difference between 2 sources\n",
    "stats['within_r_s'] = np.abs(stats['radaris_age'] - stats['spokeo_age'])\n",
    "stats['within_r_b'] = np.abs(stats['radaris_age'] - stats['been_age'])\n",
    "stats['within_s_b'] = np.abs(stats['spokeo_age'] - stats['been_age'])\n",
    "stats['within_s_p'] = np.abs(stats['spokeo_age'] - stats['peoplefinders_age'])\n",
    "stats['within_b_p'] = np.abs(stats['been_age'] - stats['peoplefinders_age'])\n",
    "stats['within_r_p'] = np.abs(stats['radaris_age'] - stats['peoplefinders_age'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "#two sources differ, but the differences is less than 3\n",
    "stats['2sourcenodiff'] = np.where((stats['within_r_s']<3)&(stats['none']==0) & (stats['threesources']==0)\n",
    "                                  & (stats['foursources']==0) & (stats['foursources']==0)\n",
    "                                  & (stats['3agree']==0)&(stats['onesource']==0),1,0)\n",
    "\n",
    "stats['2sourcenodiff'] = np.where((stats['within_r_b']<3)&(stats['none']==0) & (stats['threesources']==0) \n",
    "                                  & (stats['foursources']==0) & (stats['4agree']==0)                                  \n",
    "                                  & (stats['3agree']==0)&(stats['onesource']==0),1,0)\n",
    "\n",
    "stats['2sourcenodiff'] = np.where((stats['within_s_b']<3)&(stats['none']==0) & (stats['threesources']==0)\n",
    "                                  & (stats['foursources']==0) & (stats['4agree']==0) \n",
    "                                  & (stats['3agree']==0)&(stats['onesource']==0),1,0)\n",
    "\n",
    "stats['2sourcenodiff'] = np.where((stats['within_s_p']<3)&(stats['none']==0) & (stats['threesources']==0)\n",
    "                                  & (stats['foursources']==0) & (stats['4agree']==0) \n",
    "                                  & (stats['3agree']==0)&(stats['onesource']==0),1,0)\n",
    "\n",
    "stats['2sourcenodiff'] = np.where((stats['within_b_p']<3)&(stats['none']==0) & (stats['threesources']==0)\n",
    "                                  & (stats['foursources']==0) & (stats['4agree']==0) \n",
    "                                  & (stats['3agree']==0)&(stats['onesource']==0),1,0)\n",
    "\n",
    "stats['2sourcenodiff'] = np.where((stats['within_r_p']<3)&(stats['none']==0) & (stats['threesources']==0)\n",
    "                                  & (stats['foursources']==0) & (stats['4agree']==0) \n",
    "                                  & (stats['3agree']==0)&(stats['onesource']==0),1,0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "#replace age for two sources when differ is less than 3\n",
    "stats['oneage'] = np.where((stats['2sourcenodiff']==1) & (stats['spokeo_age']>0), stats['spokeo_age'],stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['2sourcenodiff']==1) & (stats['peoplefinders_age']>0), stats['peoplefinders_age'],stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['2sourcenodiff']==1) & (stats['radaris_age']>0), stats['radaris_age'],stats['oneage'])\n",
    "stats['oneage'] = np.where((stats['2sourcenodiff']==1) & (stats['been_age']>0), stats['been_age'],stats['oneage'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# checking for age discrepencies across multiple searches\n",
    "\n",
    "#Counting number of inventors that have different ages within a source\n",
    "searched = stats[['radaris_age','inventor_id']].groupby('inventor_id').count().reset_index().rename(columns={'radaris_age':'searched'})\n",
    "stats = pd.merge(stats, searched, on='inventor_id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-37-d622af556c03>:4: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  within_r = within[['radaris_age','inventor_id']].drop_duplicates()[within['radaris_age']>1].groupby(['inventor_id']).count().reset_index()\n",
      "<ipython-input-37-d622af556c03>:5: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  within_s = within[['spokeo_age','inventor_id']].drop_duplicates()[within['spokeo_age']>1].groupby(['inventor_id']).count().reset_index()\n",
      "<ipython-input-37-d622af556c03>:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  within_b = within[['been_age','inventor_id']].drop_duplicates()[within['been_age']>1].groupby(['inventor_id']).count().reset_index()\n",
      "<ipython-input-37-d622af556c03>:7: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
      "  within_p = within[['peoplefinders_age','inventor_id']].drop_duplicates()[within['peoplefinders_age']>1].groupby(['inventor_id']).count().reset_index()\n"
     ]
    }
   ],
   "source": [
    "#removing inventors that only have one inventor-location pair\n",
    "within = stats[stats['searched']>1]\n",
    "#within each webdirectors, as long as an age is found, counting the number of ages found\n",
    "within_r = within[['radaris_age','inventor_id']].drop_duplicates()[within['radaris_age']>1].groupby(['inventor_id']).count().reset_index()\n",
    "within_s = within[['spokeo_age','inventor_id']].drop_duplicates()[within['spokeo_age']>1].groupby(['inventor_id']).count().reset_index()\n",
    "within_b = within[['been_age','inventor_id']].drop_duplicates()[within['been_age']>1].groupby(['inventor_id']).count().reset_index()\n",
    "within_p = within[['peoplefinders_age','inventor_id']].drop_duplicates()[within['peoplefinders_age']>1].groupby(['inventor_id']).count().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# inventors that have more than one age found within the source\n",
    "within_r = within_r[within_r['radaris_age']>1]\n",
    "within_s = within_s[within_s['spokeo_age']>1]\n",
    "within_b = within_b[within_b['been_age']>1]\n",
    "within_p = within_p[within_p['peoplefinders_age']>1]\n",
    "\n",
    "#list of inventors with multiple searches\n",
    "across_r= within_r['inventor_id'].unique().tolist()\n",
    "across_s = within_s['inventor_id'].unique().tolist()\n",
    "across_b = within_b['inventor_id'].unique().tolist()\n",
    "across_p = within_p['inventor_id'].unique().tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "across = across_r+across_s+across_b+across_p\n",
    "#if across==1 then that inventor has multiple age searches\n",
    "stats['across'] = np.where(stats['inventor_id'].isin(across),1,0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "59671"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Number of inventors with multiple age searches\n",
    "c = stats[stats['across']==1]\n",
    "c['inventor_id'].nunique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Finalized ages\n",
    "NOTE: this does not include patent information. Further cleaning is reccommended utilizing patent information to see if individuals patent before birthdate (or some age) or may be too old to realistically patent."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "#overall dataset\n",
    "#only include ages determined to be within reason\n",
    "overall = stats[(stats['oneage']>0)]\n",
    "# check if there are disagreeing ages within inventor id \n",
    "overall = overall[['inventor_id','oneage']].drop_duplicates().groupby('inventor_id').count().reset_index()\n",
    "ambig = overall[overall['oneage']>1]\n",
    "#ids of those who have more than one age assigned\n",
    "ambig_id = ambig['inventor_id'].unique()\n",
    "#dataset with only agreeing data\n",
    "agree_data = stats[~stats['inventor_id'].isin(ambig_id)]\n",
    "agree_inventorid = agree_data['inventor_id'].unique()\n",
    "agree_data = agree_data[['inventor_id','oneage']].drop_duplicates()\n",
    "#if we don't have the age, mark it as missing\n",
    "agree_data['age'] = agree_data['oneage']\n",
    "agree_data['age'] = np.where(agree_data['age']<1, np.nan, agree_data['age'])\n",
    "# average age across dataset\n",
    "agree_data= agree_data[['inventor_id','age']].drop_duplicates().groupby('inventor_id').mean().round(0).reset_index().rename(\n",
    "    columns={'age':'av_age'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "#merging ages with inventor\n",
    "inventor_age = pd.merge(inventor, agree_data, on='inventor_id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "#replace all negative values of age with missing if it's negative\n",
    "inventor_age['radaris_age'] = np.where(inventor_age['radaris_age']<0, np.nan, inventor_age['radaris_age'])\n",
    "inventor_age['spokeo_age'] = np.where(inventor_age['spokeo_age']<0, np.nan, inventor_age['spokeo_age'])\n",
    "inventor_age['been_age'] = np.where(inventor_age['been_age']<0, np.nan, inventor_age['been_age'])\n",
    "inventor_age['peoplefinders_age'] = np.where(inventor_age['peoplefinders_age']<0, np.nan, inventor_age['peoplefinders_age'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# change information from age to birthyear\n",
    "inventor_age['birthyear'] = 2019 - inventor_age['av_age']\n",
    "inventor_age['radaris_birthyear'] = 2019 - inventor_age['radaris_age']\n",
    "inventor_age['spokeo_birthyear'] = 2019 - inventor_age['spokeo_age']\n",
    "inventor_age['been_birthyear'] = 2019 - inventor_age['been_age']\n",
    "inventor_age['peoplefinders_birthyear'] = 2019 - inventor_age['peoplefinders_age']\n",
    "\n",
    "by = ['spokeo_birthyear','radaris_birthyear','been_birthyear','peoplefinders_birthyear','birthyear']\n",
    "for i in by:\n",
    "    inventor_age[i] = inventor_age[i].round(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "inventor_age = inventor_age.drop(['radaris_age','av_age','spokeo_age','been_age','peoplefinders_age','patent_id'],1).drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>inventor_id</th>\n",
       "      <th>birthyear</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [inventor_id, birthyear]\n",
       "Index: []"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#sanity check\n",
    "sanity_ia = inventor_age[['inventor_id','birthyear']].groupby('inventor_id').nunique().reset_index()\n",
    "sanity_ia[sanity_ia['birthyear']>1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add Gender"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "#adding in Gender\n",
    "#more comprehensive search of gender\n",
    "gender1 = pd.read_csv('newinventor_no_birthYear.csv').drop('prob_male_min',1)\n",
    "gender2 = pd.read_csv('newinventor_with_birthYearNA.csv')\n",
    "gender3 = pd.read_csv('newinventor_with_birthYear.csv')\n",
    "gender1 = gender1.rename(columns = {'Min_Prob_Male':'prob_male_min'})\n",
    "gender1 = gender1[['inventor_id','prob_male_min']].drop_duplicates()\n",
    "#Defining the probability to be at least 85% to be a certain gender\n",
    "gender1['gender'] = np.where(gender1['prob_male_min']>.85, 'M' , np.where(gender1['prob_male_min']<.16, 'F', np.nan))\n",
    "gender2 = gender2[['inventor_id','Min_Prob_Male']].rename(\n",
    "            columns = {'Min_Prob_Male':'prob_male_min'}).drop_duplicates()\n",
    "gender2['gender'] = np.where(gender2['prob_male_min']>.85, 'M' , np.where(gender2['prob_male_min']<.16, 'F', np.nan))\n",
    "gender3 = gender3[['inventor_id','prob_male_min']].drop_duplicates()\n",
    "gender3['gender'] = np.where(gender3['prob_male_min']>.85, 'M' , np.where(gender3['prob_male_min']<.16, 'F', np.nan))\n",
    "gender = gender1.append(gender2)\n",
    "gender = gender.append(gender3)\n",
    "gender = gender.dropna().drop_duplicates()\n",
    "# Gender search results via method by Blevins and Lincoln (DHQ, 2015), see dataset description documnet for details"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "#checking to see that there are no cases of more than one gender \n",
    "#this is only for when we can identify the inventor id\n",
    "gen_un = gender[['inventor_id','gender']].drop_duplicates().groupby('inventor_id').count().reset_index()\n",
    "gen_mix= gen_un[gen_un['gender']>1]\n",
    "gen_mix = gen_mix['inventor_id'].unique()\n",
    "#where there is more than one gender assigned, replace it with unknown\n",
    "gender['gender'] = np.where(gender.inventor_id.isin(gen_mix), 'U', gender['gender'])\n",
    "gender = gender.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>inventor_id</th>\n",
       "      <th>gender</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [inventor_id, gender]\n",
       "Index: []"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#checking to see that there are no cases of more than one gender \n",
    "gen_un = gender[['inventor_id','gender']].drop_duplicates().groupby('inventor_id').count().reset_index()\n",
    "gen_mix= gen_un[gen_un['gender']>1]\n",
    "gen_mix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "inventor_age = pd.merge(inventor_age,gender, on='inventor_id',how ='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "inventor_age['gender'] = inventor_age['gender'].replace('nan', 'U')\n",
    "inventor_age['gender'] = inventor_age['gender'].replace(np.nan, 'U')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['M', 'U', 'F'], dtype=object)"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#sanity check\n",
    "inventor_age['gender'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "unique number of inventors 1858516\n"
     ]
    }
   ],
   "source": [
    "#checking number of inventors\n",
    "print('unique number of inventors', inventor_age['inventor_id'].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "inventor_age.to_csv('inventor_age_score_gender.csv',index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inventor IDs and Patent IDs with application year\n",
    "Create the file 'inventorid_patentid_crosswalk.csv'\n",
    "\n",
    "- datereview.csv\n",
    "- record.csv\n",
    "- wrong_states.csv\n",
    "- wrong_dates_mk.csv\n",
    "- application.tsv\n",
    "- apldate_error_complete.csv\n",
    "- apldate_error_below_editedKN_complete.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/mkaltenberg/Dropbox/Inventor_age/replication_data/patent_inventor_year\n"
     ]
    }
   ],
   "source": [
    "cd \"your/folder/here/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3145: DtypeWarning: Columns (4) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n"
     ]
    }
   ],
   "source": [
    "inv = pd.read_csv('record.csv')\n",
    "inv = inv.rename(columns={'patent_processing_time':'grant_date', 'patent_date': 'patenting_processing_days'})\n",
    "dates = inv[['patent_id','grant_date','app_date']].drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3337: DtypeWarning: Columns (2,3) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  if (await self.run_code(code, result,  async_=asy)):\n"
     ]
    }
   ],
   "source": [
    "app = pd.read_table('application.tsv')\n",
    "inventor = pd.merge(inventor,app[['patent_id','date']].drop_duplicates(), on = 'patent_id', how='left')\n",
    "inventor = pd.merge(inventor,dates, on = 'patent_id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "#application date changes\n",
    "cols = range(0,49)\n",
    "appl_date_below =  pd.read_csv(\"apldate_error_below_editedKN_complete.csv\", header = 3, usecols = cols)\n",
    "# manually edited dates corrected, see dataset description document for details\n",
    "appl_date_below['appl_manual'] = np.where(appl_date_below['appl_manual'] == '-1', None, appl_date_below['appl_manual'])\n",
    "#replace missing with nans\n",
    "appl_date_below=appl_date_below[appl_date_below['appl_manual'].notnull()]\n",
    "#only include rows where we have info in appl_manual\n",
    "appl_date_above = pd.read_csv(\"apldate_error_complete.csv\", header = 2, usecols = cols)\n",
    "# manually edited dates corrected, see dataset description document for details\n",
    "appl_date_above['appl_manual'] = np.where(appl_date_above['appl_manual'] == '-1', None, appl_date_above['appl_manual'])\n",
    "#replace missing with nans\n",
    "appl_date_above=appl_date_above[appl_date_above['appl_manual'].notnull()]\n",
    "#only include rows where we have info in appl_manual"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "#merging cleaned application dates\n",
    "inventor = pd.merge(inventor, appl_date_above[['a_patent_id','appl_manual']].drop_duplicates(\n",
    "    ), right_on = ['a_patent_id'], left_on = ['patent_id'], how = 'left').drop_duplicates()\n",
    "inventor['app_date'] = np.where(inventor['appl_manual'].notnull(), inventor['appl_manual'], inventor['app_date'])\n",
    "inventor = inventor.drop(['appl_manual','a_patent_id'],1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Adding year of invention\n",
    "inventor['app_date']= inventor['app_date'].replace('1078-08-03','1978-08-03')\n",
    "inventor['app_date']= inventor['app_date'].replace('1096-01-18','1996-01-18')\n",
    "inventor['app_date']= inventor['app_date'].replace('1075-03-04','1975-03-04')\n",
    "inventor['app_date']= inventor['app_date'].replace('1078-08-03','1978-08-03')\n",
    "inventor['app_date']= inventor['app_date'].replace('1074-10-29','1974-10-29')\n",
    "inventor['appl_year'] = inventor['app_date'].str[:4]\n",
    "inventor['grant_year'] = inventor['grant_date'].str[:4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8385601"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dates = pd.read_csv('wrong_dates_mk.csv')\n",
    "dates['applyear_c'] = dates['appldate_c'].str[-4:]\n",
    "inventor = pd.merge(inventor,dates[['a_patent_id','applyear_c']].drop_duplicates(), left_on = [\n",
    "    'patent_id'], right_on = ['a_patent_id'], how = 'left')\n",
    "inventor['appl_year'] = np.where(inventor['applyear_c'].notnull(), inventor['applyear_c'], inventor['appl_year'])\n",
    "inventor = inventor.drop('applyear_c',1)\n",
    "len(inventor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8385601"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "datereview = pd.read_csv('datereview.csv')\n",
    "datereview['applyear_c'] = datereview['app_date_updated'].str[:4]\n",
    "inventor = pd.merge(inventor,datereview[['patent_id','applyear_c']].drop_duplicates(), \n",
    "                    on = ['patent_id'], how = 'left').drop_duplicates()\n",
    "inventor['appl_year'] = np.where(inventor['applyear_c'].notnull(), inventor['applyear_c'], inventor['appl_year'])\n",
    "inventor = inventor.drop(['applyear_c'],1)\n",
    "len(inventor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "inventor['appl_year'] = inventor['appl_year'].fillna(0).astype(int)\n",
    "inventor['grant_year'] = inventor['grant_year'].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "inventor[['patent_id','inventor_id','grant_year','appl_year']].drop_duplicates().to_csv('inventorid_patentid_crosswalk.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>grant_year</th>\n",
       "      <th>appl_year</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>8385601.000</td>\n",
       "      <td>8385601.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>2004.846</td>\n",
       "      <td>2002.141</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>10.914</td>\n",
       "      <td>11.254</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1976.000</td>\n",
       "      <td>0.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>1998.000</td>\n",
       "      <td>1996.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>2008.000</td>\n",
       "      <td>2004.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>2014.000</td>\n",
       "      <td>2011.000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>2018.000</td>\n",
       "      <td>2018.000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        grant_year    appl_year\n",
       "count  8385601.000  8385601.000\n",
       "mean      2004.846     2002.141\n",
       "std         10.914       11.254\n",
       "min       1976.000        0.000\n",
       "25%       1998.000     1996.000\n",
       "50%       2008.000     2004.000\n",
       "75%       2014.000     2011.000\n",
       "max       2018.000     2018.000"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inventor[['patent_id','inventor_id','grant_year','appl_year']].describe().round(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "unique number of patents 3648663\n",
      "unique number of inventors 1858516\n"
     ]
    }
   ],
   "source": [
    "# overview\n",
    "print('unique number of patents',inventor['patent_id'].nunique())\n",
    "print('unique number of inventors', inventor['inventor_id'].nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
